Categorical Outcome Modeling (Tree Fitting)

Preparing Workspace

Load necessary packages

library(caret) # Classification and Regression Training
## Warning: package 'caret' was built under R version 3.5.2
## Loading required package: lattice
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.5.2
library(corrplot)
## corrplot 0.84 loaded
library(dplyr) # A Grammar of Data Manipulations
## Warning: package 'dplyr' was built under R version 3.5.2
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(forcats) # Tools for Working with Categorical Variables (Factors)
## Warning: package 'forcats' was built under R version 3.5.2
library(gdata) # Various R Programming Tools for Data Manipulation
## gdata: read.xls support for 'XLS' (Excel 97-2004) files ENABLED.
## 
## gdata: read.xls support for 'XLSX' (Excel 2007+) files ENABLED.
## 
## Attaching package: 'gdata'
## The following objects are masked from 'package:dplyr':
## 
##     combine, first, last
## The following object is masked from 'package:stats':
## 
##     nobs
## The following object is masked from 'package:utils':
## 
##     object.size
## The following object is masked from 'package:base':
## 
##     startsWith
library(ggplot2) # Create Elegant Data Visualizations Using the Grammar of Graphics
library(ggrepel) # Automatically Position Non-Overlapping Text Labels with ‘ggplot2’
## Warning: package 'ggrepel' was built under R version 3.5.2
library(ggridges) # Ridgeline Plots in ‘ggplot2’
## 
## Attaching package: 'ggridges'
## The following object is masked from 'package:ggplot2':
## 
##     scale_discrete_manual
library(ggthemes) # Extra Themes, Scales, and Geoms for ‘ggplot2’
## Warning: package 'ggthemes' was built under R version 3.5.2
library(gmodels) # Various R Programming Tools for Model Fitting
library(grid) # The Grid Graphics Package
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:gdata':
## 
##     combine
## The following object is masked from 'package:dplyr':
## 
##     combine
library(knitr) # A General-Purpose Package for Dynamic Report Generation in R
## Warning: package 'knitr' was built under R version 3.5.2
library(lmtest) # Testing Linear Regression Models
## Warning: package 'lmtest' was built under R version 3.5.2
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(plyr) # Tools for Splitting, Applying and Combining Data
## -------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## -------------------------------------------------------------------------
## 
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
library(purrr) # Functional Programming Tools
## Warning: package 'purrr' was built under R version 3.5.2
## 
## Attaching package: 'purrr'
## The following object is masked from 'package:plyr':
## 
##     compact
## The following object is masked from 'package:gdata':
## 
##     keep
## The following object is masked from 'package:caret':
## 
##     lift
library(readr) # Read Rectangular Text Data
library(rpart)
## Warning: package 'rpart' was built under R version 3.5.2
library(rattle)
## Rattle: A free graphical interface for data science with R.
## Version 5.2.0 Copyright (c) 2006-2018 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
library(scales) # Scale Functions for Visualization
## 
## Attaching package: 'scales'
## The following object is masked from 'package:readr':
## 
##     col_factor
## The following object is masked from 'package:purrr':
## 
##     discard
library(shiny) # Web Application Framework for R
## Warning: package 'shiny' was built under R version 3.5.2
library(skimr) # Compact and Flexible Summaries of Data
## Warning: package 'skimr' was built under R version 3.5.2
## 
## Attaching package: 'skimr'
## The following object is masked from 'package:knitr':
## 
##     kable
## The following object is masked from 'package:stats':
## 
##     filter
library(stringr) # Simple, Consistent Wrappers for Common String Operations
library(tibble) # Simple Data Frames
## Warning: package 'tibble' was built under R version 3.5.2
library(tidyr) # Easily Tidy Data with ‘spread()’ and ‘gather()’ Functions
## Warning: package 'tidyr' was built under R version 3.5.2
library(tidyverse) # Install and Load ‘Tidyverse’ Packages
library(usmap)
## Warning: package 'usmap' was built under R version 3.5.2
library(rworldmap)
## Loading required package: sp
## ### Welcome to rworldmap ###
## For a short introduction type :   vignette('rworldmap')
library(ggmap)
## Warning: package 'ggmap' was built under R version 3.5.2
## Google's Terms of Service: https://cloud.google.com/maps-platform/terms/.
## Please cite ggmap if you use it! See citation("ggmap") for details.
library(maps)
## 
## Attaching package: 'maps'
## The following object is masked from 'package:purrr':
## 
##     map
## The following object is masked from 'package:plyr':
## 
##     ozone
library(mapdata)
library(cowplot)
## Warning: package 'cowplot' was built under R version 3.5.2
## 
## ********************************************************
## Note: As of version 1.0.0, cowplot does not change the
##   default ggplot2 theme anymore. To recover the previous
##   behavior, execute:
##   theme_set(theme_cowplot())
## ********************************************************
## 
## Attaching package: 'cowplot'
## The following object is masked from 'package:ggmap':
## 
##     theme_nothing
## The following object is masked from 'package:ggthemes':
## 
##     theme_map
library("googleway")
library("ggspatial") 
library("rnaturalearth") 
library("rnaturalearthdata")
theme_set(theme_bw())
library("sf")
## Warning: package 'sf' was built under R version 3.5.2
## Linking to GEOS 3.7.2, GDAL 2.4.2, PROJ 5.2.0
library(gridExtra)

Creating dfs of color schemes I may use

state_colors <- c("brown1","lightseagreen","goldenrod1","slateblue")
species_colors <- c("royalblue","salmon","darkmagenta")
pisaster_colors <- c("thistle1","plum","orchid3","darkmagenta")
RMSE_colors <- c("slategray1","steelblue3","thistle2","plum3")
bioreg_colours <- c("brown3","goldenrod1","darkolivegreen4","skyblue3","mediumpurple3","hotpink3")

Load cleaned data

#clean_data <- readRDS("./data/processed_data/processeddata.rds")
clean_data <- readRDS("../../data/processed_data/processeddata.rds")
skim(clean_data)
## Skim summary statistics
##  n obs: 13165 
##  n variables: 23 
## 
## ── Variable type:factor ───────────────────────────────────────────────
##               variable missing complete     n n_unique
##              bioregion       0    13165 13165        6
##              georegion       0    13165 13165        9
##             group_code       0    13165 13165        9
##  group_code_UCSC_other       0    13165 13165        2
##                 island       0    13165 13165        6
##     marine_season_code       0    13165 13165       57
##       marine_site_name       0    13165 13165       77
##            method_code       0    13165 13165        4
##   method_code_IP_other       0    13165 13165        2
##             mpa_region       0    13165 13165        6
##            season_name       0    13165 13165        4
##              site_code       0    13165 13165       77
##           species_code       0    13165 13165        3
##                  state       0    13165 13165        4
##                                              top_counts ordered
##   San: 5608, Oly: 4138, Gov: 2366, Sal: 652               FALSE
##   CA : 5382, CA : 2592, CA : 1660, OR: 1386               FALSE
##    UCS: 9657, UCL: 2032, PBN: 400, SSS: 366               FALSE
##                 UCS: 9657, Oth: 3508, NA: 0               FALSE
##    Mai: 12364, Bar: 279, Sad: 250, Hat: 150               FALSE
##      SU1: 440, SU1: 403, SU1: 392, FA1: 376               FALSE
##      Mil: 524, Sco: 513, Sti: 472, End: 468               FALSE
##      IP: 12012, BT2: 782, GSE: 336, TS3: 35               FALSE
##                 IP: 12012, Oth: 1153, NA: 0               FALSE
##  Cen: 5382, Sou: 2627, NUL: 1932, Nor: 1660               FALSE
##    Sum: 4552, Fal: 4489, Spr: 4112, Win: 12               FALSE
##      MCR: 524, SCT: 513, SWC: 472, END: 468               FALSE
##                                           P.o: 11416, K   FALSE
##       CA: 10548, OR: 1386, WA: 865, AK: 366               FALSE
## 
## ── Variable type:integer ──────────────────────────────────────────────
##              variable missing complete     n    mean      sd   p0  p25
##  marine_common_season       0    13165 13165  116.82   19.23   78  101
##    marine_common_year       0    13165 13165 2009.7     4.81 2000 2006
##     marine_sort_order       0    13165 13165 5566.24 1125.15 1720 5020
##       season_sequence       0    13165 13165    2.03    0.81    1    1
##              size_bin       0    13165 13165   82.44   42.2     5   50
##       size_sort_order       0    13165 13165    9.57    4.15    1    6
##                 total       0    13165 13165    9.92   16.43    1    2
##   p50  p75 p100     hist
##   118  131  152 ▅▅▆▇▇▇▇▅
##  2010 2013 2018 ▃▅▅▇▇▆▆▆
##  6090 6370 7650 ▁▁▁▂▃▃▇▁
##     2    3    4 ▇▁▇▁▁▇▁▁
##    80  110  320 ▅▇▇▃▁▁▁▁
##    10   12   33 ▅▇▇▃▁▁▁▁
##     4   11  360 ▇▁▁▁▁▁▁▁
## 
## ── Variable type:numeric ──────────────────────────────────────────────
##   variable missing complete     n    mean   sd      p0     p25     p50
##   latitude       0    13165 13165   38.67 5.2    32.87   34.73   36.62
##  longitude       0    13165 13165 -122.25 2.86 -135.38 -123.98 -121.94
##      p75    p100     hist
##    41.65   57.05 ▇▆▃▂▁▁▁▁
##  -120.62 -117.25 ▁▁▁▁▅▇▆▃
names(clean_data)
##  [1] "group_code"            "site_code"            
##  [3] "marine_site_name"      "marine_sort_order"    
##  [5] "latitude"              "longitude"            
##  [7] "marine_common_season"  "marine_season_code"   
##  [9] "marine_common_year"    "season_sequence"      
## [11] "season_name"           "method_code"          
## [13] "species_code"          "size_sort_order"      
## [15] "size_bin"              "total"                
## [17] "mpa_region"            "georegion"            
## [19] "bioregion"             "island"               
## [21] "group_code_UCSC_other" "method_code_IP_other" 
## [23] "state"

Data Visualization

Visualizing outcome of interest (bioregion)

summary(clean_data$bioregion)
##    AK_2_BritColumb   ChannelIsl_South    GovtPt_2_Mexico 
##                366                 35               2366 
## OlyCstWA_2_SanFran       SalishSea_WA   SanFran_2_GovtPt 
##               4138                652               5608
bioregion_counts <- clean_data %>% mutate(bioregion = recode(bioregion, "AK_2_BritColumb"="AK to \n British \n Columbia","ChannelIsl_South"="Channel \n Islands \n South","SalishSea_WA"="Salish \n Sea, WA","OlyCstWA_2_SanFran"="Olympic \n Coast WA \n to SanFran","SanFran_2_GovtPt"="SanFran to \n Gov't Point","GovtPt_2_Mexico"="Gov't Point \n to Mexico"))
skim(bioregion_counts$bioregion)
## 
## Skim summary statistics
## 
## ── Variable type:factor ───────────────────────────────────────────────
##                    variable missing complete     n n_unique
##  bioregion_counts$bioregion       0    13165 13165        6
##                                 top_counts ordered
##  San: 5608, Oly: 4138, Gov: 2366, Sal: 652   FALSE
bioregion_counts_ordered <- bioregion_counts %>%
  mutate(bioregion = factor(bioregion, levels = c("AK to \n British \n Columbia","Salish \n Sea, WA","Olympic \n Coast WA \n to SanFran","SanFran to \n Gov't Point","Channel \n Islands \n South","Gov't Point \n to Mexico"))) %>%
  arrange(bioregion) 

Creating graphic to represent above results

bioregion_histogram <- bioregion_counts_ordered %>% ggplot() + geom_bar(aes(bioregion),fill=bioreg_colours) #+  theme(axis.text.x = element_text(angle=25, hjust=1))
bioregion_histogram 

ggsave(filename = "../../results/Categorical_Outcome_Modeling_results/1.bioregion_histogram.png",plot =bioregion_histogram , width = 5, height = 4)

Plotting numerical predictors

writing code that produces plots showing our outcome of interest on the x-axis and each numeric predictor on the y-axis.

bioregion_counts_ordered$season_sequence <- as.factor(bioregion_counts_ordered$season_sequence)
numer_data <- bioregion_counts_ordered %>% select_if(is.numeric) 

numer_data$bioregion <- bioregion_counts_ordered$bioregion

numer_data <- numer_data %>% dplyr::select(-size_sort_order)
numer_data_plot <- numer_data %>% gather(-bioregion, key = "var", value = "value") %>% 
            ggplot() + geom_point(aes(x = bioregion, y=value, col=bioregion)) + theme(axis.text.x = element_blank()) +
            facet_wrap(~ var, scales = "free") + theme_bw() + theme(legend.position = "none", axis.title.x = element_blank()) +  scale_color_manual(values=bioreg_colours)
print(numer_data_plot)

latitude_bioreg <- numer_data %>% ggplot(aes(x = bioregion, y=latitude, col=bioregion)) + geom_boxplot() + geom_point(size=4,alpha=0.1) + ylab("Latitude") + theme(axis.text.x = element_blank(),axis.title.x=element_blank(),legend.position = "none",axis.title.y=element_text(size=12),axis.text.y=element_text(size=12)) + scale_color_manual(values=bioreg_colours)
#latitude_bioreg

longitude_bioreg <- numer_data %>% ggplot(aes(x = bioregion, y=longitude, col=bioregion)) + geom_boxplot() + geom_point(size=4,alpha=0.1) + ylab("Longitude") +  theme(axis.text.x = element_blank(),axis.title.x=element_blank(),legend.position = "none",axis.title.y=element_text(size=12),axis.text.y=element_text(size=12)) + scale_color_manual(values=bioreg_colours)
#longitude_bioreg


season_chron_bioreg <- numer_data %>% ggplot(aes(x = bioregion, y=marine_common_season, col=bioregion)) + geom_boxplot() + geom_point(size=4,alpha=0.1) + ylab("Season & Year") + theme(axis.text.x = element_blank(),axis.title.x=element_blank(),legend.position = "none",axis.title.y=element_text(size=12),axis.text.y=element_text(size=12)) + scale_color_manual(values=bioreg_colours)
#season_chron_bioreg

year_bioreg <- numer_data %>% ggplot(aes(x = bioregion, y=marine_common_year, col=bioregion)) + geom_boxplot() + geom_point(size=4,alpha=0.1) + ylab("Year") +  theme(axis.text.x = element_blank(),axis.title.x=element_blank(),legend.position = "none",axis.title.y=element_text(size=12),axis.text.y=element_text(size=12)) + scale_color_manual(values=bioreg_colours)
#year_bioreg

site_bioreg <- numer_data %>% ggplot(aes(x = bioregion, y=marine_sort_order, col=bioregion)) + geom_boxplot() + geom_point(size=4,alpha=0.1) + ylab("Sites (Ordered)") +  theme(axis.text.x = element_blank(),axis.title.x=element_blank(),legend.position = "none",axis.title.y=element_text(size=12),axis.text.y=element_text(size=12)) + scale_color_manual(values=bioreg_colours)
#site_bioreg

size_bioreg <- numer_data %>% ggplot(aes(x = bioregion, y=size_bin, col=bioregion)) + geom_boxplot() + geom_point(size=4,alpha=0.1) + ylab("Size Class") +  theme(axis.text.x = element_blank(),axis.title.x=element_blank(),legend.position = "none",axis.title.y=element_text(size=12),axis.text.y=element_text(size=12)) + scale_color_manual(values=bioreg_colours)
#size_bioreg

abundance_bioreg <- numer_data %>% ggplot(aes(x = bioregion, y=log(total), col=bioregion)) + geom_boxplot() + geom_point(size=4,alpha=0.1) + ylab("log(Spp. Abundance)") +  theme(axis.text.x = element_blank(),axis.title.x=element_blank(),legend.position = "none",axis.title.y=element_text(size=12),axis.text.y=element_text(size=12)) + scale_color_manual(values=bioreg_colours)
#abundance_bioreg

Creating graphic to represent above results

grid.arrange(latitude_bioreg,longitude_bioreg,site_bioreg,nrow=1,top="(A) Spatial Numeric Predictors")

ggsave(filename = "../../results/Categorical_Outcome_Modeling_results/2.spatial_num_pred.png",plot =grid.arrange(latitude_bioreg,longitude_bioreg,site_bioreg,nrow=1,top="(A) Spatial Numeric Predictors")
 , width = 5, height = 4)

grid.arrange(season_chron_bioreg,year_bioreg,nrow=1,top="(B) Temporal Numeric Predictors")

ggsave(filename = "../../results/Categorical_Outcome_Modeling_results/3.temporal_num_pred.png",plot =grid.arrange(season_chron_bioreg,year_bioreg,nrow=1,top="(B) Temporal Numeric Predictors"))
## Saving 7 x 5 in image
grid.arrange(abundance_bioreg,size_bioreg,nrow=1,top="(C) Spp.-Related Numeric Predictors")

ggsave(filename = "../../results/Categorical_Outcome_Modeling_results/4.spp.rel_num_pred.png",plot =grid.arrange(abundance_bioreg,size_bioreg,nrow=1,top="(C) Spp.-Related Numeric Predictors"))
## Saving 7 x 5 in image
#grid.arrange(latitude_bioreg,longitude_bioreg,season_chron_bioreg,year_bioreg,site_bioreg,size_bioreg,abundance_bioreg,nrow=3)

Plotting categorical predictors

categ_data <- bioregion_counts_ordered %>% select_if(is.factor) 
categ_data <- categ_data %>% dplyr::select(-marine_site_name,-season_sequence,-marine_season_code)
names(categ_data)
##  [1] "group_code"            "site_code"            
##  [3] "season_name"           "method_code"          
##  [5] "species_code"          "mpa_region"           
##  [7] "georegion"             "bioregion"            
##  [9] "island"                "group_code_UCSC_other"
## [11] "method_code_IP_other"  "state"
categ_data_plotting <- categ_data %>% dplyr::select(-method_code_IP_other,-group_code_UCSC_other)

groupcode_bioreg <- categ_data_plotting %>% ggplot(aes(x = bioregion, y=group_code, col=bioregion)) + geom_jitter(size=4,alpha=0.15,width=0.4,height=0.25) + ylab("Surveying Group") + theme(axis.text.x = element_blank(),axis.title.x=element_blank(),legend.position = "none",axis.title.y=element_text(size=12),axis.text.y=element_text(size=12)) + scale_color_manual(values=bioreg_colours)
#groupcode_bioreg

sitecode_bioreg <- categ_data_plotting %>% ggplot(aes(x = bioregion, y=site_code, col=bioregion)) + geom_jitter(size=4,alpha=0.15,width=0.4,height=0.25) + ylab("Site") + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),axis.text.x = element_blank(),axis.title.x=element_blank(),axis.text.y = element_blank(),legend.position = "none",axis.title.y=element_text(size=12))+ scale_color_manual(values=bioreg_colours)
#sitecode_bioreg

season_bioreg <- categ_data_plotting %>% ggplot(aes(x = bioregion, y=season_name, col=bioregion)) + geom_jitter(size=4,alpha=0.15,width=0.4,height=0.25) + ylab("Season") + theme(axis.text.x = element_blank(),axis.title.x=element_blank(),legend.position = "none",axis.title.y=element_text(size=12),axis.text.y=element_text(size=12))+ scale_color_manual(values=bioreg_colours)
#season_bioreg

method_bioreg <- categ_data_plotting %>% ggplot(aes(x = bioregion, y=method_code, col=bioregion)) + geom_jitter(size=4,alpha=0.15,width=0.4,height=0.25) + ylab("Sampling Method") + theme(axis.text.x = element_blank(),axis.title.x=element_blank(),legend.position = "none",axis.title.y=element_text(size=12),axis.text.y=element_text(size=12))+ scale_color_manual(values=bioreg_colours)
#method_bioreg

species_bioreg <- categ_data_plotting %>% ggplot(aes(x = bioregion, y=species_code, col=bioregion)) + geom_jitter(size=4,alpha=0.15,width=0.4,height=0.25) + ylab("Species") + theme(axis.text.x = element_blank(),axis.title.x=element_blank(),legend.position = "none",axis.title.y=element_text(size=12),axis.text.y=element_text(size=12))+ scale_color_manual(values=bioreg_colours)
#species_bioreg

mpareg_bioreg <- categ_data_plotting %>% ggplot(aes(x = bioregion, y=mpa_region, col=bioregion)) + geom_jitter(size=4,alpha=0.15,width=0.4,height=0.25) + ylab("MPA Region") + theme(axis.text.x = element_blank(),axis.title.x=element_blank(),legend.position = "none",axis.title.y=element_text(size=12),axis.text.y=element_text(size=12))+ scale_color_manual(values=bioreg_colours)
#mpareg_bioreg

georegion_bioreg <- categ_data_plotting %>% ggplot(aes(x = bioregion, y=georegion, col=bioregion)) + geom_jitter(size=4,alpha=0.15,width=0.4,height=0.25) + ylab("Georegion") + theme(axis.text.x = element_blank(),axis.title.x=element_blank(),legend.position = "none",axis.title.y=element_text(size=12),axis.text.y=element_text(size=12))+ scale_color_manual(values=bioreg_colours)
#georegion_bioreg

island_bioreg <- categ_data_plotting %>% ggplot(aes(x = bioregion, y=island, col=bioregion)) + geom_jitter(size=4,alpha=0.15,width=0.4,height=0.25) + ylab("Island Location?") + theme(axis.text.x = element_blank(),axis.title.x=element_blank(),legend.position = "none",axis.title.y=element_text(size=12),axis.text.y=element_text(size=12))+ scale_color_manual(values=bioreg_colours)
#island_bioreg

state_bioreg <- categ_data_plotting %>% ggplot(aes(x = bioregion, y=state, col=bioregion)) + geom_jitter(size=4,alpha=0.15,width=0.4,height=0.25) + ylab("State") + theme(axis.text.x = element_blank(),axis.title.x=element_blank(),legend.position = "none",axis.title.y=element_text(size=12),axis.text.y=element_text(size=12))+ scale_color_manual(values=bioreg_colours)
#state_bioreg

Creating graphic to represent above results

grid.arrange(groupcode_bioreg,method_bioreg,species_bioreg,nrow=3,top="(A) Sample Info Categorical Predictors")

ggsave(filename = "../../results/Categorical_Outcome_Modeling_results/5.sampleinfo_cat_pred.png",plot =grid.arrange(groupcode_bioreg,method_bioreg,species_bioreg,nrow=3,top="(A) Sample Info Categorical Predictors"))
## Saving 7 x 5 in image
grid.arrange(state_bioreg,georegion_bioreg,sitecode_bioreg,nrow=3,top="(B) Spatial Categorical Predictors")

ggsave(filename = "../../results/Categorical_Outcome_Modeling_results/6.spatial_cat_pred.png",plot =grid.arrange(state_bioreg,georegion_bioreg,sitecode_bioreg,nrow=3,top="(B) Spatial Categorical Predictors"))
## Saving 7 x 5 in image
grid.arrange(season_bioreg,mpareg_bioreg,island_bioreg,nrow=3,top="(C) Ecological Categorical Predictors")

ggsave(filename = "../../results/Categorical_Outcome_Modeling_results/7.eco_cat_pred.png",plot =grid.arrange(season_bioreg,mpareg_bioreg,island_bioreg,nrow=3,top="(C) Ecological Categorical Predictors"))
## Saving 7 x 5 in image
#grid.arrange(groupcode_bioreg,sitecode_bioreg,season_bioreg,method_bioreg,species_bioreg,mpareg_bioreg,georegion_bioreg,island_bioreg,state_bioreg)

Modeling

Making sure Bioregion is first

#names(clean_data) # bioregion is 19, 23 total, get rid of 3 and 14
clean_data_final <- clean_data[c(19,1,2,4,5,6,7,8,9,10,11,12,13,15,16,17,18,20,21,22,23)]
names(clean_data_final) # double checking
##  [1] "bioregion"             "group_code"           
##  [3] "site_code"             "marine_sort_order"    
##  [5] "latitude"              "longitude"            
##  [7] "marine_common_season"  "marine_season_code"   
##  [9] "marine_common_year"    "season_sequence"      
## [11] "season_name"           "method_code"          
## [13] "species_code"          "size_bin"             
## [15] "total"                 "mpa_region"           
## [17] "georegion"             "island"               
## [19] "group_code_UCSC_other" "method_code_IP_other" 
## [21] "state"

Data splitting

#write code that splits data into 70/30 train/test
#call the 2 parts data_train and data_test
library(caret)
set.seed(123)
trainset_treefit <- caret::createDataPartition(y = clean_data_final$bioregion, p = 0.7, list = FALSE)
data_train_treefit = clean_data_final[trainset_treefit,] #extract observations/rows for training, assign to new variable
data_test_treefit = clean_data_final[-trainset_treefit,] #do the same for the test set

Model Fitting: Null Model

library("mlr")
## Warning: package 'mlr' was built under R version 3.5.2
## Loading required package: ParamHelpers
## Warning: package 'ParamHelpers' was built under R version 3.5.2
## 
## Attaching package: 'mlr'
## The following object is masked from 'package:gdata':
## 
##     resample
## The following object is masked from 'package:caret':
## 
##     train
null_accuracy_treefit <- measureACC("SanFran_2_GovtPt", clean_data_final$bioregion)
null_accuracy_treefit
## [1] 0.425978

Single predictor models

#There is probably a nicer tidyverse way of doing this. I just couldn't think of it, so did it this way.
set.seed(1111) #makes each code block reproducible
outcomename = "bioregion"
fitControl <- trainControl(method="repeatedcv",number=5,repeats=5) #setting CV method for caret
Npred <- ncol(data_train_treefit)-1 # number of predictors
resultmat <- data.frame(Variable = names(data_train_treefit)[-1], Accuracy = rep(0,Npred)) #store performance for each variable
for (n in 2:ncol(data_train_treefit)) #loop over each predictor. For this to work, outcome must be in 1st column
{
  fit1 <- caret::train( as.formula(paste(outcomename, "~",names(data_train_treefit)[n])), data = data_train_treefit, method = "rpart", trControl = fitControl, na.action = na.pass, tuneLength = 10) 
resultmat[n-1,2]= max(fit1$results$Accuracy)  
}
print(resultmat)
##                 Variable  Accuracy
## 1             group_code 0.6853236
## 2              site_code 0.7972035
## 3      marine_sort_order 1.0000000
## 4               latitude 0.9997831
## 5              longitude 1.0000000
## 6   marine_common_season 0.7002923
## 7     marine_season_code 0.6245137
## 8     marine_common_year 0.4435607
## 9        season_sequence 0.6970392
## 10           season_name 0.6970383
## 11           method_code 0.5139393
## 12          species_code 0.4583143
## 13              size_bin 0.4284417
## 14                 total 0.4427816
## 15            mpa_region 0.9036776
## 16             georegion 0.9838376
## 17                island 0.4861698
## 18 group_code_UCSC_other 0.6034276
## 19  method_code_IP_other 0.5112258
## 20                 state 0.6080918

Creating graphic to represent above results

null_accuracy_treefit <- 0.426
library(knitr)
library(kableExtra)
## Warning: package 'kableExtra' was built under R version 3.5.2
## 
## Attaching package: 'kableExtra'
## The following object is masked from 'package:skimr':
## 
##     kable
## The following object is masked from 'package:dplyr':
## 
##     group_rows
Variable <- c("size_bin","total","marine_common_year","species_code","island","method_code_IP_other","method_code","group_code_UCSC_other","state","marine_season_code","group_code","season_name","season_sequence","marine_common_season","site_code","mpa_region","georegion","latitude", "longitude","marine_sort_order")
Accuracy <- c(0.43,0.44,0.44,0.46,0.49,0.51,0.51,0.60,0.61,0.62,0.69,0.70,0.70,0.70,0.80,0.90,0.98,1.00,1.00,1.00)
Singlepred_df <- data.frame(Variable,Accuracy)
Singlepred_df
##                 Variable Accuracy
## 1               size_bin     0.43
## 2                  total     0.44
## 3     marine_common_year     0.44
## 4           species_code     0.46
## 5                 island     0.49
## 6   method_code_IP_other     0.51
## 7            method_code     0.51
## 8  group_code_UCSC_other     0.60
## 9                  state     0.61
## 10    marine_season_code     0.62
## 11            group_code     0.69
## 12           season_name     0.70
## 13       season_sequence     0.70
## 14  marine_common_season     0.70
## 15             site_code     0.80
## 16            mpa_region     0.90
## 17             georegion     0.98
## 18              latitude     1.00
## 19             longitude     1.00
## 20     marine_sort_order     1.00
Singlepred_df_ordered <- Singlepred_df %>%
  mutate(Variable = factor(Variable, levels = c("size_bin","total","marine_common_year","species_code","island","method_code_IP_other","method_code","group_code_UCSC_other","state","marine_season_code","group_code","season_name","season_sequence","marine_common_season","site_code","mpa_region","georegion","latitude", "longitude","marine_sort_order"))) %>%
  arrange(Variable)

# vertical line conditioning
lvls <- levels(Singlepred_df_ordered$Variable)
vline.level <- 'site_code' # you want to draw line here, right before 18

#geom_vline(aes(xintercept = myLoc))
# plotting
Single_Predictor_Accuracy <- Singlepred_df_ordered %>% ggplot(aes(x = Variable, y=Accuracy, label =Accuracy)) + geom_hline(yintercept=null_accuracy_treefit,color='red', size=2,linetype="dashed") + ylim(0.35,1) + geom_text(stat='identity',color="black", size=5,fill="lightgrey",angle=90) + theme(title = element_text(size=12), axis.title.y = element_text(size=14),axis.title.x = element_blank(),axis.text.y = element_text(size=12), axis.text.x = element_text(size=8,angle=40,hjust=1))
## Warning: Ignoring unknown parameters: fill
Single_Predictor_Accuracy

ggsave(filename = "../../results/Categorical_Outcome_Modeling_results/8.single_predictor_accuracy.png",plot = Single_Predictor_Accuracy, width = 6, height = 4)

Subsetting Data

So a lot of these have very strong (and in some cases, perfect, aka 100%) correlation with bioregion, and for very obvious reasons. I removed variables in 2 ways: one was be more conservative (aka I removed those with above a 90% accuracy- calling it data_subset_A) and one will be less conservative (aka I removed those with above a 70% accuracy- calling it data_subset_B). I ended up deciding subset B would be the most useful moving forward, so I deleted the portion of this code that analyzed subset A to make the script run faster.

Preparing Data Subset B

data_subsetB <- clean_data_final %>% dplyr::select(-marine_sort_order, -latitude, -longitude, -mpa_region, -georegion, -site_code, -marine_common_season)

Data Splitting: Data Subset B

#write code that splits data into 70/30 train/test
#call the 2 parts data_train and data_test
library(caret)
set.seed(123)
trainset_subsetB <- caret::createDataPartition(y = data_subsetB$bioregion, p = 0.7, list = FALSE)
data_train_subsetB = data_subsetB[trainset_subsetB,] #extract observations/rows for training, assign to new variable
data_test_subsetB = data_subsetB[-trainset_subsetB,] #do the same for the test set

Null Model- Data Subset B

library("mlr")
null_accuracy_train_subsetB <- measureACC("SanFran_2_GovtPt", data_train_subsetB$bioregion)
null_accuracy_train_subsetB
## [1] 0.4258596

Single predictor models- Data Subset B

#There is probably a nicer tidyverse way of doing this. I just couldn't think of it, so did it this way.
set.seed(1111) #makes each code block reproducible
outcomename = "bioregion"
fitControl <- trainControl(method="repeatedcv",number=5,repeats=5) #setting CV method for caret
Npred <- ncol(data_train_subsetB)-1 # number of predictors
resultmat <- data.frame(Variable = names(data_train_subsetB)[-1], Accuracy = rep(0,Npred)) #store performance for each variable
for (n in 2:ncol(data_train_subsetB)) #loop over each predictor. For this to work, outcome must be in 1st column
{
  fit1_subsetB <- caret::train(as.formula(paste(outcomename, "~",names(data_train_subsetB)[n])), data = data_train_subsetB, method = "rpart", trControl = fitControl, na.action = na.pass, tuneLength = 10) 
resultmat[n-1,2]= max(fit1_subsetB$results$Accuracy)  
}
print(resultmat)
##                 Variable  Accuracy
## 1             group_code 0.6853236
## 2     marine_season_code 0.6255127
## 3     marine_common_year 0.4434338
## 4        season_sequence 0.6970379
## 5            season_name 0.6970384
## 6            method_code 0.5139386
## 7           species_code 0.4583582
## 8               size_bin 0.4283541
## 9                  total 0.4440185
## 10                island 0.4861701
## 11 group_code_UCSC_other 0.6034279
## 12  method_code_IP_other 0.5112258
## 13                 state 0.6080911

Full model- Data Subset B

Now let’s fit a full logistic model with all predictors.

set.seed(1111) #makes each code block reproducible
fitControl <- trainControl(method="repeatedcv",number=5,repeats=5) 
fit1_subsetB = caret::train(bioregion  ~ ., data=data_train_subsetB, method="rpart",  trControl = fitControl, na.action = na.pass, tuneLength = 10) 
print(fit1_subsetB$results)
##             cp  Accuracy     Kappa  AccuracySD     KappaSD
## 1  0.001448454 0.9559813 0.9353351 0.005539782 0.008196510
## 2  0.001511430 0.9558728 0.9351623 0.005431082 0.008033431
## 3  0.003211789 0.9541595 0.9326107 0.005192102 0.007676062
## 4  0.004723219 0.9522712 0.9298251 0.005571414 0.008246573
## 5  0.007746080 0.9489743 0.9249124 0.006593184 0.009792540
## 6  0.017286983 0.9352643 0.9041527 0.012654225 0.019132717
## 7  0.044020404 0.9124214 0.8700696 0.011311635 0.016670342
## 8  0.045342906 0.8879650 0.8317558 0.014722759 0.023893200
## 9  0.310409975 0.7822478 0.6516885 0.089881304 0.154096737
## 10 0.472321935 0.5865451 0.2998710 0.133975385 0.249913555
library(rpart.plot)
prp(fit1_subsetB$finalModel, extra = 1, type = 1)

Random forest

set.seed(1111) #makes each code block reproducible
tuning_grid <- expand.grid( .mtry = seq(1,7,by=1), .splitrule = "gini", .min.node.size = seq(2,8,by=1) )
fit2 = caret::train(bioregion ~ ., data=data_train_subsetB, method="ranger", trControl = fitControl, tuneGrid = tuning_grid, na.action = na.pass) 
plot(fit2)

Boosted tree ensemble

library(gbm)
## Warning: package 'gbm' was built under R version 3.5.2
## Loaded gbm 2.1.5
gbmGrid <- expand.grid(interaction.depth = seq(1, 7, by = 2), n.trees = 10, shrinkage = c(0.1, 0.01), n.minobsinnode = c(2,4,6))
fit3 = caret::train(bioregion ~ ., data=data_train_subsetB, method="gbm", trControl = fitControl, verbose=FALSE, tuneGrid = gbmGrid) 
plot(fit3)

Random forest with pre-processed predictors

# copy the random forest code from above. Add a statement to the train() function that centers and scales predictors.
# save the result as fit4. 
set.seed(1111) #makes each code block reproducible
tuning_grid <- expand.grid( .mtry = seq(1,7,by=1), .splitrule = "gini", .min.node.size = seq(2,8,by=1) )
fit4 = caret::train(bioregion ~ ., preProcess = c("center", "scale"), data=data_train_subsetB, method="ranger",  trControl = fitControl, tuneGrid = tuning_grid, na.action = na.pass) 
plot(fit4)

Discriminant analysis

#write code that trains a pda model, use tuneLength 20. Save as fit5 and plot.
set.seed(1111) #makes each code block reproducible
tuning_grid <- expand.grid(lambda = seq(0,1,by=0.05))
fit5 = caret::train(bioregion ~ ., data=data_train_subsetB, method="pda",  trControl = fitControl, tuneGrid = tuning_grid, na.action = na.pass, tuneLength = 20) 
## Warning: predictions failed for Fold1.Rep1: lambda=0.00 Error in mindist[l] <- ndist[l] : 
##   NAs are not allowed in subscripted assignments
## Warning: predictions failed for Fold2.Rep1: lambda=0.00 Error in mindist[l] <- ndist[l] : 
##   NAs are not allowed in subscripted assignments
## Warning: predictions failed for Fold3.Rep1: lambda=0.00 Error in mindist[l] <- ndist[l] : 
##   NAs are not allowed in subscripted assignments
## Warning: predictions failed for Fold4.Rep1: lambda=0.00 Error in mindist[l] <- ndist[l] : 
##   NAs are not allowed in subscripted assignments
## Warning: predictions failed for Fold5.Rep1: lambda=0.00 Error in mindist[l] <- ndist[l] : 
##   NAs are not allowed in subscripted assignments
## Warning: predictions failed for Fold1.Rep2: lambda=0.00 Error in mindist[l] <- ndist[l] : 
##   NAs are not allowed in subscripted assignments
## Warning: predictions failed for Fold2.Rep2: lambda=0.00 Error in mindist[l] <- ndist[l] : 
##   NAs are not allowed in subscripted assignments
## Warning: predictions failed for Fold3.Rep2: lambda=0.00 Error in mindist[l] <- ndist[l] : 
##   NAs are not allowed in subscripted assignments
## Warning: predictions failed for Fold4.Rep2: lambda=0.00 Error in mindist[l] <- ndist[l] : 
##   NAs are not allowed in subscripted assignments
## Warning: predictions failed for Fold5.Rep2: lambda=0.00 Error in mindist[l] <- ndist[l] : 
##   NAs are not allowed in subscripted assignments
## Warning: predictions failed for Fold1.Rep3: lambda=0.00 Error in mindist[l] <- ndist[l] : 
##   NAs are not allowed in subscripted assignments
## Warning: predictions failed for Fold2.Rep3: lambda=0.00 Error in mindist[l] <- ndist[l] : 
##   NAs are not allowed in subscripted assignments
## Warning: predictions failed for Fold3.Rep3: lambda=0.00 Error in mindist[l] <- ndist[l] : 
##   NAs are not allowed in subscripted assignments
## Warning: predictions failed for Fold4.Rep3: lambda=0.00 Error in mindist[l] <- ndist[l] : 
##   NAs are not allowed in subscripted assignments
## Warning: predictions failed for Fold5.Rep3: lambda=0.00 Error in mindist[l] <- ndist[l] : 
##   NAs are not allowed in subscripted assignments
## Warning: predictions failed for Fold1.Rep4: lambda=0.00 Error in mindist[l] <- ndist[l] : 
##   NAs are not allowed in subscripted assignments
## Warning: predictions failed for Fold2.Rep4: lambda=0.00 Error in mindist[l] <- ndist[l] : 
##   NAs are not allowed in subscripted assignments
## Warning: predictions failed for Fold3.Rep4: lambda=0.00 Error in mindist[l] <- ndist[l] : 
##   NAs are not allowed in subscripted assignments
## Warning: predictions failed for Fold4.Rep4: lambda=0.00 Error in mindist[l] <- ndist[l] : 
##   NAs are not allowed in subscripted assignments
## Warning: predictions failed for Fold5.Rep4: lambda=0.00 Error in mindist[l] <- ndist[l] : 
##   NAs are not allowed in subscripted assignments
## Warning: predictions failed for Fold1.Rep5: lambda=0.00 Error in mindist[l] <- ndist[l] : 
##   NAs are not allowed in subscripted assignments
## Warning: predictions failed for Fold2.Rep5: lambda=0.00 Error in mindist[l] <- ndist[l] : 
##   NAs are not allowed in subscripted assignments
## Warning: predictions failed for Fold3.Rep5: lambda=0.00 Error in mindist[l] <- ndist[l] : 
##   NAs are not allowed in subscripted assignments
## Warning: predictions failed for Fold4.Rep5: lambda=0.00 Error in mindist[l] <- ndist[l] : 
##   NAs are not allowed in subscripted assignments
## Warning: predictions failed for Fold5.Rep5: lambda=0.00 Error in mindist[l] <- ndist[l] : 
##   NAs are not allowed in subscripted assignments
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info =
## trainInfo, : There were missing values in resampled performance measures.
## Warning in train.default(x, y, weights = w, ...): missing values found in
## aggregated results
#plot model results.
plot(fit5)

Comparing model performance

resamps <- resamples(list(tree = fit1,
                          RF1 = fit2,
                          GBM = fit3,
                          RF2 = fit4, 
                          PDA = fit5))
bwplot(resamps)

The column labeled “Accuracy” is the overall agreement rate averaged over cross-validation iterations. The agreement standard deviation is also calculated from the cross-validation results. The column “Kappa” is Cohen’s (unweighted) Kappa statistic averaged across the resampling results.

resamps_no_tree <- resamples(list(RF1 = fit2,
                          GBM = fit3,
                          RF2 = fit4, 
                          PDA = fit5))
bwplot(resamps_no_tree)

This suggests that while they are all pretty close, it seems like GBM (fit3) may be the best.

Evaluating the final model (boosted tree ensemble)

library('tidyr')
library('dplyr')
library('forcats')
library('ggplot2')
library('knitr')
library('caret')
#library('doParallel')
library('rpart')
library('rpart.plot')
library('mda')
## Loading required package: class
## Loaded mda 0.4-10
library('ranger')
## Warning: package 'ranger' was built under R version 3.5.2
## 
## Attaching package: 'ranger'
## The following object is masked from 'package:rattle':
## 
##     importance
library('e1071')
## Warning: package 'e1071' was built under R version 3.5.2
## 
## Attaching package: 'e1071'
## The following object is masked from 'package:mlr':
## 
##     impute

Generate confusion matrix on train data

#Write code to get model predictions for the outcome on the training data..
predicted <- predict(fit3, data_train_subsetB)
confusion(data_train_subsetB$bioregion, predicted) #generates confusion matrix
##                     true
## predicted            AK_2_BritColumb ChannelIsl_South GovtPt_2_Mexico
##   AK_2_BritColumb                257                0               0
##   ChannelIsl_South                 0               25               0
##   GovtPt_2_Mexico                  0                0            1656
##   OlyCstWA_2_SanFran               0                0               0
##   SalishSea_WA                     0                0               0
##   SanFran_2_GovtPt                 0                0               6
##                     true
## predicted            OlyCstWA_2_SanFran SalishSea_WA SanFran_2_GovtPt
##   AK_2_BritColumb                     0            0                0
##   ChannelIsl_South                    0            0                0
##   GovtPt_2_Mexico                     1            0                0
##   OlyCstWA_2_SanFran               2562            0              335
##   SalishSea_WA                        0          457                0
##   SanFran_2_GovtPt                   26            0             3894
# generating table with all results, with TRUE/FALSE accuracy column to compute ACC
train_data_results <- tibble("Actual" = data_train_subsetB$bioregion, "Predicted" = predicted)
train_accuracy <- measureACC(train_data_results$Predicted, train_data_results$Actual)
#use predicted and actual outcomes, make a table and compute accuracy.

Creating graphic to represent above results

library(knitr)
#install.packages("kableExtra")
library(kableExtra)
labels <- c("Predicted","AK_BrtClm","Salish_WA","OlyCstWA_SF","SF_GovtPt","Chnl_Isl_S","GovtPt_Mexico")
AK_2_BritColumb_col <- c("AK_BrtClm",257,0,0,0,0,0)
SalishSea_WA_col <- c("Salish_WA",0,457,0,0,0,0)
OlyCstWA_2_SanFran_col <- c("OlyCstWA_SF",0,0,2560,25,0,1)
SanFran_2_GovtPt <- c("SF_GovtPt",0,0,337,3895,0,0)
ChannelIsl_South_col <- c("Chnl_Isl_S",0,0,0,0,25,0)
GovtPt_2_Mexico_col <- c("GovtPt_Mexico",0,0,0,6,0,1656)
contingency_table <- data.frame(labels,AK_2_BritColumb_col,SalishSea_WA_col,OlyCstWA_2_SanFran_col,SanFran_2_GovtPt,ChannelIsl_South_col,GovtPt_2_Mexico_col)
colnames(contingency_table) <- c("","Actual","","","","","")
#Multipred_df$RMSE <- as.(Multipred_df$RMSE)
contingency_table
##                    Actual                                           
## 1     Predicted AK_BrtClm Salish_WA OlyCstWA_SF SF_GovtPt Chnl_Isl_S
## 2     AK_BrtClm       257         0           0         0          0
## 3     Salish_WA         0       457           0         0          0
## 4   OlyCstWA_SF         0         0        2560       337          0
## 5     SF_GovtPt         0         0          25      3895          0
## 6    Chnl_Isl_S         0         0           0         0         25
## 7 GovtPt_Mexico         0         0           1         0          0
##                
## 1 GovtPt_Mexico
## 2             0
## 3             0
## 4             0
## 5             6
## 6             0
## 7          1656
kable(contingency_table) %>%
  kable_styling(full_width = F)
Actual
Predicted AK_BrtClm Salish_WA OlyCstWA_SF SF_GovtPt Chnl_Isl_S GovtPt_Mexico
AK_BrtClm 257 0 0 0 0 0
Salish_WA 0 457 0 0 0 0
OlyCstWA_SF 0 0 2560 337 0 0
SF_GovtPt 0 0 25 3895 0 6
Chnl_Isl_S 0 0 0 0 25 0
GovtPt_Mexico 0 0 1 0 0 1656
#ggsave(filename = "../../results/Categorical_Outcome_Modeling_results/12.PredvsActual.png",plot = kable(contingency_table), width = 5, height = 4)

Calculate null ACC on test data

null_accuracy_test_subsetB <- measureACC("SanFran_2_GovtPt", data_test_subsetB$bioregion)
null_accuracy_test_subsetB
## [1] 0.4262544

Generate confusion matrix on test data

#copy and paste the code from above, but now do it for the test set.
#Write code to get model predictions for the outcome on the training data..
predicted_final <- predict(fit3, data_test_subsetB)
confusion(data_test_subsetB$bioregion, predicted_final) #generates confusion matrix
##                     true
## predicted            AK_2_BritColumb ChannelIsl_South GovtPt_2_Mexico
##   AK_2_BritColumb                109                0               0
##   ChannelIsl_South                 0               10               0
##   GovtPt_2_Mexico                  0                0             706
##   OlyCstWA_2_SanFran               0                0               0
##   SalishSea_WA                     0                0               0
##   SanFran_2_GovtPt                 0                0               0
##                     true
## predicted            OlyCstWA_2_SanFran SalishSea_WA SanFran_2_GovtPt
##   AK_2_BritColumb                     0            0                0
##   ChannelIsl_South                    0            0                0
##   GovtPt_2_Mexico                     3            0                0
##   OlyCstWA_2_SanFran               1097            0              144
##   SalishSea_WA                        0          195                0
##   SanFran_2_GovtPt                   16            0             1666
# generating table with all results, with TRUE/FALSE accuracy column to compute ACC
test_data_results <- tibble("Actual" = data_test_subsetB$bioregion, "Predicted" = predicted_final)
test_accuracy <- measureACC(test_data_results$Predicted, test_data_results$Actual)
#use predicted and actual outcomes, make a table and compute accuracy.

Creating graphic to represent above results

library(knitr)
#install.packages("kableExtra")
library(kableExtra)
labels <- c("Predicted","AK_BrtClm","Salish_WA","OlyCstWA_SF","SF_GovtPt","Chnl_Isl_S","GovtPt_Mexico")
AK_2_BritColumb_col <- c("AK_BrtClm",109,0,0,0,0,0)
SalishSea_WA_col <- c("Salish_WA",0,195,0,0,0,0)
OlyCstWA_2_SanFran_col <- c("OlyCstWA_SF",0,0,1096,14,0,3)
SanFran_2_GovtPt <- c("SF_GovtPt",0,0,145,1668,0,0)
ChannelIsl_South_col <- c("Chnl_Isl_S",0,0,0,0,10,0)
GovtPt_2_Mexico_col <- c("GovtPt_Mexico",0,0,0,0,0,706)

contingency_table2 <- data.frame(labels,AK_2_BritColumb_col,SalishSea_WA_col,OlyCstWA_2_SanFran_col,SanFran_2_GovtPt,ChannelIsl_South_col,GovtPt_2_Mexico_col)
colnames(contingency_table) <- c("","Actual","","","","","")
contingency_table2
##          labels AK_2_BritColumb_col SalishSea_WA_col
## 1     Predicted           AK_BrtClm        Salish_WA
## 2     AK_BrtClm                 109                0
## 3     Salish_WA                   0              195
## 4   OlyCstWA_SF                   0                0
## 5     SF_GovtPt                   0                0
## 6    Chnl_Isl_S                   0                0
## 7 GovtPt_Mexico                   0                0
##   OlyCstWA_2_SanFran_col SanFran_2_GovtPt ChannelIsl_South_col
## 1            OlyCstWA_SF        SF_GovtPt           Chnl_Isl_S
## 2                      0                0                    0
## 3                      0                0                    0
## 4                   1096              145                    0
## 5                     14             1668                    0
## 6                      0                0                   10
## 7                      3                0                    0
##   GovtPt_2_Mexico_col
## 1       GovtPt_Mexico
## 2                   0
## 3                   0
## 4                   0
## 5                   0
## 6                   0
## 7                 706
kable(contingency_table) %>%
  kable_styling(full_width = F)
Actual
Predicted AK_BrtClm Salish_WA OlyCstWA_SF SF_GovtPt Chnl_Isl_S GovtPt_Mexico
AK_BrtClm 257 0 0 0 0 0
Salish_WA 0 457 0 0 0 0
OlyCstWA_SF 0 0 2560 337 0 0
SF_GovtPt 0 0 25 3895 0 6
Chnl_Isl_S 0 0 0 0 25 0
GovtPt_Mexico 0 0 1 0 0 1656

Creating graphic to represent above results

null_accuracy_test_subsetB<- round(null_accuracy_test_subsetB,4)
null_accuracy_train_subsetB<- round(null_accuracy_train_subsetB,4)
train_accuracy<- round(train_accuracy,4)
test_accuracy <- round(test_accuracy,4)

Data_cat <- c("Null_train","GBMmodel_train","Null_test","GBMmodel_test")
accuracy_cat <- c(null_accuracy_train_subsetB,train_accuracy,null_accuracy_test_subsetB,test_accuracy)
Final_Results_cat <- data.frame(Data_cat,accuracy_cat)
Final_Results_cat
##         Data_cat accuracy_cat
## 1     Null_train       0.4259
## 2 GBMmodel_train       0.9601
## 3      Null_test       0.4263
## 4  GBMmodel_test       0.9587
Final_Results_cat_ordered <- Final_Results_cat %>%
  mutate(Data_cat = factor(Data_cat, levels = c("Null_train","GBMmodel_train","Null_test","GBMmodel_test"))) %>%
  arrange(Data_cat) 

Final_Results_cat_ordered_plot <- Final_Results_cat_ordered %>% ggplot(aes(x = Data_cat, y=accuracy_cat, label=accuracy_cat)) +geom_label(stat='identity',fill=RMSE_colors, size=9, face="bold") + ylab("Accuracy (%)") + ylim(0.3,1.0) + 
  theme(title = element_text(size=12), axis.title.y = element_text(size=16),axis.title.x = element_blank(),axis.text.y = element_text(size=12), axis.text.x = element_text(size=12))
## Warning: Ignoring unknown parameters: face
Final_Results_cat_ordered_plot

ggsave(filename = "../../results/Categorical_Outcome_Modeling_results/14.final_results_cat.png",plot = Final_Results_cat_ordered_plot, width = 5, height = 4)

Accuracy went down, but only by ~0.1%, and its still very high.